# Doubly-robust causal analysis of private vs public schooling (PISA, Australia).
# NOTE(review): removed `rm(list = ls())` -- wiping the caller's global
# environment is a script anti-pattern; run this file in a fresh R session
# instead of clearing someone else's workspace.
library('rstan')
library('bayesplot')
library('ggplot2')
library('BART')
# Large base font for the bayesplot figures used throughout.
bayesplot_theme_set(theme_default(base_size = 24, base_family = "sans"))
# edited to add "escs" and "wealth" to the data
# Load the cleaned PISA data set; path is relative to the project root.
data_full <- readRDS("./data/cleaned/cleaned_data.Rds")
# only consider AUS - for better balance between treatment/control groups
data_AUS <- data_full[data_full$country == 'AUS', ]
# Treatment/control split: "private" vs "public" schools (counts shown below).
table(data_AUS$public_private)
##
## private public
## 5210 6596
head(data_AUS)
## # A tibble: 6 × 14
## country school_id student_id gender computer internet math read science
## <fct> <fct> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl>
## 1 AUS 3600002 3600210 female yes yes 541. 611. 530.
## 2 AUS 3600002 3602562 female yes yes 431. 469. 454.
## 3 AUS 3600002 3602765 male yes yes 617. 642. 659.
## 4 AUS 3600002 3603797 female yes yes 359. 449. 401.
## 5 AUS 3600002 3606990 female yes yes 475. 440. 443.
## 6 AUS 3600002 3608618 female yes yes 436. 498. 420.
## # ℹ 5 more variables: stu_wgt <dbl>, wealth <dbl>, escs <dbl>,
## # public_private <fct>, family_class <chr>
# Keep the analysis variables: outcomes (math/read/science), candidate
# confounders, the treatment indicator, and the survey weights.
data <- data_AUS[, c("gender", "computer", "internet", "math", "read", "science", "public_private", "family_class", "wealth", "escs", "stu_wgt")]
# One-hot encode the binary covariates as 0/1 numerics.
data$gender <- as.numeric(data$gender == "female")
data$computer <- as.numeric(data$computer == "yes")
data$internet <- as.numeric(data$internet == "yes")
# Treatment indicator: 1 = private school, 0 = public school.
data$public_private <- as.numeric(data$public_private == "private")
data$family_class <- as.numeric(data$family_class == "middle class")
# combine "computer" and "internet" into "tech_access" (1 only when both are "yes")
data$tech_access <- data$computer * data$internet
# log transformation of the scores - will transform back when making predictions
data$math_log <- log(data$math)
data$read_log <- log(data$read)
data$science_log <- log(data$science)
head(data)
## # A tibble: 6 × 15
## gender computer internet math read science public_private family_class
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 541. 611. 530. 1 1
## 2 1 1 1 431. 469. 454. 1 1
## 3 0 1 1 617. 642. 659. 1 1
## 4 1 1 1 359. 449. 401. 1 1
## 5 1 1 1 475. 440. 443. 1 1
## 6 1 1 1 436. 498. 420. 1 1
## # ℹ 7 more variables: wealth <dbl>, escs <dbl>, stu_wgt <dbl>,
## # tech_access <dbl>, math_log <dbl>, read_log <dbl>, science_log <dbl>
# Design matrix of confounders, selected by NAME (the original used positional
# indices c(1, 8:10, 12) = gender, family_class, wealth, escs, tech_access,
# which silently breaks if the columns are ever reordered).
X <- data[, c("gender", "family_class", "wealth", "escs", "tech_access")]
# Constant column serving as the intercept term in the Stan models.
X[, "Offset"] <- 1
# Treatment indicator (1 = private). NOTE(review): `t` shadows base::t();
# function calls t(...) still resolve to the transpose, but keep it in mind.
t <- data$public_private
treat_ind <- data$public_private == 1
control_ind <- data$public_private == 0
X_treat <- X[treat_ind, ]
X_control <- X[control_ind, ]
head(X_treat)
## # A tibble: 6 × 6
## gender family_class wealth escs tech_access Offset
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0.852 0.101 1 1
## 2 1 1 0.664 -0.793 1 1
## 3 0 1 -0.267 0.166 1 1
## 4 1 1 1.31 0.726 1 1
## 5 1 1 0.901 -0.170 1 1
## 6 1 1 1.10 1.21 1 1
head(X_control)
## # A tibble: 6 × 6
## gender family_class wealth escs tech_access Offset
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0 0.168 -0.170 1 1
## 2 0 0 -0.887 -0.337 1 1
## 3 0 1 0.536 -1.13 1 1
## 4 1 0 0.541 -0.898 1 1
## 5 0 1 0.146 0.769 1 1
## 6 0 1 -1.25 -1.41 0 1
# Multiply every row of `input_matrix` elementwise by `vector`, i.e. scale
# column j by vector[j] (length(vector) must equal ncol(input_matrix)).
# Vectorized via sweep() instead of the original row-by-row loop, which also
# fixes the zero-row edge case (the loop's `1:row_num` iterates over c(1, 0)
# when nrow == 0 and then errors on input_matrix[1, ]).
element_prod <- function(input_matrix, vector){
  sweep(input_matrix, 2, vector, `*`)
}
# # outcome model 1 - bayesian linear regression
# outcome_normal_code = "
# data {
# int<lower=0> N; // number of data items
# int<lower=0> N_test; // number of test data items
# int<lower=0> K; // number of predictors
# real<lower=0> pr_sd; // std dev of the prior
# matrix[N, K] x; // predictor matrix
# real y[N]; // output vector
# matrix[N_test, K] x_test; // predictor matrix
# }
#
# parameters {
# vector[K] beta; // coefficients for predictors
# real<lower=0> sigma; // error scale
# }
#
# transformed parameters {
# vector[N] mu = x * beta;
# }
#
# model {
# beta ~ normal(0,pr_sd); // Note: beta is k-dim
# sigma ~ inv_gamma(0.001,0.001); // Close to a flat prior
# y ~ normal(mu,sigma); // likelihood
# }
#
# generated quantities {
# real y_rep[N_test];
# y_rep = normal_rng(x_test * beta, sigma);
# } "
#
# outcome_normal = stan_model(model_code=outcome_normal_code)
# propensity score model
# Bayesian logistic regression of treatment assignment (private school) on the
# covariates, with independent normal(0, pr_sd) priors on the coefficients.
# The generated quantities block draws replicated outcomes y_rep for posterior
# predictive checks. (Assignment switched from `=` to `<-` for consistency
# with the rest of the file; the Stan program itself is unchanged.)
ps_logis_code <- "
data {
int<lower=0> N; // number of data items
int<lower=0> K; // number of predictors
real<lower=0> pr_sd; // std dev of the prior
matrix[N, K] x; // predictor matrix
int<lower=0, upper=1> y[N]; // Binary outcome variable
}
parameters {
vector[K] beta; // Regression coefficients
}
transformed parameters {
vector[N] mu = x * beta;
}
model {
// Prior for beta
beta ~ normal(0, pr_sd);
// Likelihood
y ~ bernoulli_logit(mu); // Logistic regression likelihood
}
generated quantities {
int<lower=0, upper=1> y_rep[N];
y_rep = bernoulli_logit_rng(mu);
} "
ps_logis <- stan_model(model_code = ps_logis_code)
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/Rcpp/include/" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/unsupported" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/src/" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppParallel/include/" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DUSE_STANC3 -DSTRICT_R_HEADERS -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION -D_HAS_AUTO_PTR_ETC=0 -include '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fPIC -Wall -g -O2 -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
## #include <cmath>
## ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1
# Doubly-robust (AIPW-style) per-unit causal contrasts, one row per posterior draw.
#   y:        observed outcome vector, length N
#   t:        binary treatment vector (0/1), length N
#   y0_pred:  draws x N matrix of predicted untreated outcomes
#   y1_pred:  draws x N matrix of predicted treated outcomes
#   ps:       draws x N matrix of propensity scores P(t = 1 | x)
# Returns a draws x N matrix: (y1 - y0) + (y - y_obs_model) / p(observed arm).
# BUG FIX: the original computed `y - y_pred` directly; R recycles the
# length-N vector down the COLUMNS of a draws x N matrix (column-major), which
# misaligns units with their outcomes whenever draws > 1. We expand y to a
# matching matrix row-wise instead (the same semantics the script's own
# t(t(M) - v) idiom uses elsewhere).
dr_estimate <- function(y, t, y0_pred, y1_pred, ps){
  # Prediction / propensity under the arm each unit actually received:
  # sweep(..., 2, v, "*") scales column j by v[j] for every draw (row).
  y_pred <- sweep(y0_pred, 2, 1 - t, "*") + sweep(y1_pred, 2, t, "*")
  ps_pred <- sweep(1 - ps, 2, 1 - t, "*") + sweep(ps, 2, t, "*")
  # Row-wise replication of y so that element (i, j) pairs y[j] with draw i.
  y_mat <- matrix(y, nrow = nrow(y_pred), ncol = length(y), byrow = TRUE)
  (y1_pred - y0_pred) + (y_mat - y_pred) / ps_pred
}
# Data list matching the Stan model's `data` block. NOTE(review): x is passed
# as a tibble, not a matrix; the run output below shows rstan accepted it, but
# as.matrix(X) would be the safer form -- confirm before changing.
ps_data <- list(N = nrow(X), K = ncol(X), pr_sd = 10, x=X, y=t)
# Single chain keeps runtime down; mixing is eyeballed via trace plots below.
ps_fit <- sampling(ps_logis, data=ps_data, iter=5000, warmup=4000, chains=1)
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.00074 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 7.4 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 5000 [ 0%] (Warmup)
## Chain 1: Iteration: 500 / 5000 [ 10%] (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%] (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%] (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%] (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%] (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%] (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%] (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%] (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%] (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%] (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 37.971 seconds (Warm-up)
## Chain 1: 12.484 seconds (Sampling)
## Chain 1: 50.455 seconds (Total)
## Chain 1:
# Posterior draws of the propensity scores P(t = 1 | x): one row per draw,
# one column per student. plogis() is the inverse logit and is numerically
# more stable than 1/(1 + exp(-mu)) for large |mu|.
ps <- plogis(extract(ps_fit)$mu)
# check mixing
class(ps_fit)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(ps_fit)
# Label the coefficient columns with the covariate names. BUG FIX: the
# original hard-coded `[1:5]` while X has 6 columns (incl. Offset), which
# raised "number of items to replace is not a multiple of replacement length"
# and left the Offset coefficient unnamed; 1:ncol(X) keeps lengths in sync.
colnames(post_smp)[1:ncol(X)] <- colnames(X)
# Posterior densities of the first five coefficients (Offset omitted).
mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)
# Trace plots to eyeball mixing of each coefficient's chain.
plot(post_smp$gender, type='l')
plot(post_smp$family_class, type='l')
plot(post_smp$tech_access, type='l')
plot(post_smp$wealth, type='l')
plot(post_smp$escs, type='l')
# predictive check on the propensity score model
y_rep <- extract(ps_fit)$y_rep
# One posterior predictive check compares the number of 0's and 1's in the observed vs predictive datasets
ppc_bars(y=t,yrep=y_rep)+ggplot2::labs(y = "Number of 0's and 1's in observed and Predictive datasets")
# Another just plots the proportions of 1's
ppc_stat(y=t,yrep=y_rep)+ggplot2::labs(x = "Proportion of 1's in observed and Predictive datasets")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Outcome for the first analysis: log math scores, split by treatment arm.
y <- data$math_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]
### bayesian linear regression model
# math_outcome_treat <- list(N = nrow(X_treat),
# N_test = nrow(X),
# K = ncol(X_treat),
# pr_sd = 10,
# x = X_treat,
# y = y_treat,
# x_test = X)
# math_outcome_treat_regress <- sampling(outcome_normal, data=math_outcome_treat, iter=5000, warmup=4000, chains=4)
# # check mixing
# class(math_outcome_treat_regress)
# post_smp <- as.data.frame(math_outcome_treat_regress)
# colnames(post_smp)[1:5] <- colnames(X)
#
# mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)
#
# plot(post_smp$gender, type='l')
# plot(post_smp$family_class, type='l')
# plot(post_smp$tech_access, type='l')
# # predictive check
# y1_pred <- extract(math_outcome_treat_regress)$y_rep
#
# ppd_intervals(y1_pred[ ,treat_ind], x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1) +
# ggplot2::labs(y = "Predicted y's", x = "Observed y's")
#
# ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0) +
# ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's")
# BART outcome model for log math scores, trained on the treatment (private)
# group and predicted over the FULL sample to obtain counterfactual draws.
# X[...][1:5] drops the Offset column (BART needs no intercept).
# BUG FIX: the survey weights must align with the training rows -- the
# original passed the full-length stu_wgt vector (11806 values) for a
# 5210-row training set, silently misweighting the fit.
math_outcome_treat <- wbart(x.train = as.matrix(X_treat[1:5]),
                            y.train = y_treat,
                            x.test = as.matrix(X[1:5]),
                            w = data_AUS$stu_wgt[treat_ind])
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.072444, -0.096642
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.032790,3.000000,0.005912
## *****sigma: 0.174221
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 38s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# Posterior predictive check for the treatment-group math model: predicted vs
# observed log scores, then prediction errors, over the treated units only.
y1_pred <- math_outcome_treat$yhat.test
ppd_intervals(y1_pred[, treat_ind], x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's",
                title = "Math: Predictive check for treatment group")
# t(t(M) - v) subtracts v[j] from column j: per-unit prediction errors.
# Title fixed: this panel is the TREATMENT group (original said "control").
ppd_intervals(t(t(y1_pred[, treat_ind]) - y[treat_ind]), x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's",
                title = "Math: Predictive check for treatment group")
# Model choice: BART is slightly better than the Bayesian linear regression model (smaller variance) - use BART for outcome modeling.
# BART outcome model for log math scores, trained on the control (public)
# group and predicted over the full sample.
# BUG FIX: subset the survey weights to the control rows (the original passed
# the full-length stu_wgt vector for a 6596-row training set).
math_outcome_control <- wbart(x.train = as.matrix(X_control[1:5]),
                              y.train = y_control,
                              x.test = as.matrix(X[1:5]),
                              w = data_AUS$stu_wgt[control_ind])
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: -0.034944, 0.102562
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.030787,3.000000,0.006994
## *****sigma: 0.189487
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 51s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# Posterior predictive check for the control-group math model on its own
# (control) units: predicted vs observed, then per-unit prediction errors.
y0_pred <- math_outcome_control$yhat.test
obs <- y[control_ind]
ppd_intervals(y0_pred[, control_ind], x = obs) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's",
                title = "Math: Predictive check for control group")
# sweep(M, 2, v) subtracts v[j] from column j (same as t(t(M) - v)).
pred_err <- sweep(y0_pred[, control_ind], 2, obs)
ppd_intervals(pred_err, x = obs) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's",
                title = "Math: Predictive check for control group")
# Repeat the outcome modeling for log reading scores.
y <- data$read_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]
# BUG FIX: subset the survey weights to the treated training rows (the
# original passed the full-length stu_wgt vector).
read_outcome_treat <- wbart(x.train = as.matrix(X_treat[1:5]),
                            y.train = y_treat,
                            x.test = as.matrix(X[1:5]),
                            w = data_AUS$stu_wgt[treat_ind])
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.171592, -0.356653
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.028634,3.000000,0.007594
## *****sigma: 0.197444
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 44s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# Posterior predictive check for the treatment-group reading model.
y1_pred <- read_outcome_treat$yhat.test
ppd_intervals(y1_pred[, treat_ind], x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's",
                title = "Read: Predictive check for treatment group")
# Title fixed: this panel is the TREATMENT group (original said "control").
ppd_intervals(t(t(y1_pred[, treat_ind]) - y[treat_ind]), x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's",
                title = "Read: Predictive check for treatment group")
# BART outcome model for log reading scores on the control (public) group.
# BUG FIX: subset the survey weights to the control training rows.
read_outcome_control <- wbart(x.train = as.matrix(X_control[1:5]),
                              y.train = y_control,
                              x.test = as.matrix(X[1:5]),
                              w = data_AUS$stu_wgt[control_ind])
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: 0.158013, -0.005358
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.034070,3.000000,0.009718
## *****sigma: 0.223363
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 51s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# Posterior predictive check for the control-group reading model on its own
# (control) units: predicted vs observed, then per-unit prediction errors.
y0_pred <- read_outcome_control$yhat.test
obs <- y[control_ind]
ppd_intervals(y0_pred[, control_ind], x = obs) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's",
                title = "Read: Predictive check for control group")
# sweep(M, 2, v) subtracts v[j] from column j (same as t(t(M) - v)).
pred_err <- sweep(y0_pred[, control_ind], 2, obs)
ppd_intervals(pred_err, x = obs) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's",
                title = "Read: Predictive check for control group")
# Repeat the outcome modeling for log science scores.
y <- data$science_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]
# BUG FIX: subset the survey weights to the treated training rows (the
# original passed the full-length stu_wgt vector).
science_outcome_treat <- wbart(x.train = as.matrix(X_treat[1:5]),
                               y.train = y_treat,
                               x.test = as.matrix(X[1:5]),
                               w = data_AUS$stu_wgt[treat_ind])
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.030477, -0.223005
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.029797,3.000000,0.006803
## *****sigma: 0.186884
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 41s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# Posterior predictive check for the treatment-group science model.
# Titles fixed: this is the Science/TREATMENT panel (original copy-pasted
# "Read" and "control group" from the previous section).
y1_pred <- science_outcome_treat$yhat.test
ppd_intervals(y1_pred[, treat_ind], x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's",
                title = "Science: Predictive check for treatment group")
ppd_intervals(t(t(y1_pred[, treat_ind]) - y[treat_ind]), x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's",
                title = "Science: Predictive check for treatment group")
# BART outcome model for log science scores on the control (public) group.
# BUG FIX: subset the survey weights to the control training rows.
science_outcome_control <- wbart(x.train = as.matrix(X_control[1:5]),
                                 y.train = y_control,
                                 x.test = as.matrix(X[1:5]),
                                 w = data_AUS$stu_wgt[control_ind])
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: -0.079194, -0.131055
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.028021,3.000000,0.008300
## *****sigma: 0.206423
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 49s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# Posterior predictive check for the control-group science model.
# Titles fixed: this is the Science section (original said "Read").
y0_pred <- science_outcome_control$yhat.test
ppd_intervals(y0_pred[, control_ind], x = y[control_ind]) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's",
                title = "Science: Predictive check for control group")
ppd_intervals(t(t(y0_pred[, control_ind]) - y[control_ind]), x = y[control_ind]) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's",
                title = "Science: Predictive check for control group")
# Doubly-robust causal contrasts on the original (unlogged) math scale:
# outcome-model draws are exponentiated back from the log scale first.
y <- data$math
y0_pred <- exp(math_outcome_control$yhat.test)
y1_pred <- exp(math_outcome_treat$yhat.test)
causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)
# Eight subgroups: gender x family_class x tech_access (gender varies fastest).
group1_ind <- with(X, gender == 1 & family_class == 1 & tech_access == 1)
group2_ind <- with(X, gender == 0 & family_class == 1 & tech_access == 1)
group3_ind <- with(X, gender == 1 & family_class == 0 & tech_access == 1)
group4_ind <- with(X, gender == 0 & family_class == 0 & tech_access == 1)
group5_ind <- with(X, gender == 1 & family_class == 1 & tech_access == 0)
group6_ind <- with(X, gender == 0 & family_class == 1 & tech_access == 0)
group7_ind <- with(X, gender == 1 & family_class == 0 & tech_access == 0)
group8_ind <- with(X, gender == 0 & family_class == 0 & tech_access == 0)
# Averaging contrasts within each subgroup, per draw, gives the CATE posterior.
cate1 <- rowMeans(causal_contrasts[, group1_ind])
cate2 <- rowMeans(causal_contrasts[, group2_ind])
cate3 <- rowMeans(causal_contrasts[, group3_ind])
cate4 <- rowMeans(causal_contrasts[, group4_ind])
cate5 <- rowMeans(causal_contrasts[, group5_ind])
cate6 <- rowMeans(causal_contrasts[, group6_ind])
cate7 <- rowMeans(causal_contrasts[, group7_ind])
cate8 <- rowMeans(causal_contrasts[, group8_ind])
cate_math <- setNames(
  data.frame(cate1, cate2, cate3, cate4, cate5, cate6, cate7, cate8),
  c("female, middle, tech", "male, middle, tech",
    "female, working, tech", "male, working, tech",
    "female, middle, no_tech", "male, middle, no_tech",
    "female, working, no_tech", "male, working, no_tech")
)
# Wide left margin so the subgroup labels fit on the horizontal boxplot.
par(mar = c(4, 11, 4, 2))
boxplot(cate_math, horizontal = TRUE, las = 1,
        main = "Math: Conditional Average Treatment Effect")
# Doubly-robust causal contrasts on the original (unlogged) reading scale.
y <- data$read
y0_pred <- exp(read_outcome_control$yhat.test)
y1_pred <- exp(read_outcome_treat$yhat.test)
causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)
# Eight subgroups: gender x family_class x tech_access (gender varies fastest).
group1_ind <- with(X, gender == 1 & family_class == 1 & tech_access == 1)
group2_ind <- with(X, gender == 0 & family_class == 1 & tech_access == 1)
group3_ind <- with(X, gender == 1 & family_class == 0 & tech_access == 1)
group4_ind <- with(X, gender == 0 & family_class == 0 & tech_access == 1)
group5_ind <- with(X, gender == 1 & family_class == 1 & tech_access == 0)
group6_ind <- with(X, gender == 0 & family_class == 1 & tech_access == 0)
group7_ind <- with(X, gender == 1 & family_class == 0 & tech_access == 0)
group8_ind <- with(X, gender == 0 & family_class == 0 & tech_access == 0)
# Averaging contrasts within each subgroup, per draw, gives the CATE posterior.
cate1 <- rowMeans(causal_contrasts[, group1_ind])
cate2 <- rowMeans(causal_contrasts[, group2_ind])
cate3 <- rowMeans(causal_contrasts[, group3_ind])
cate4 <- rowMeans(causal_contrasts[, group4_ind])
cate5 <- rowMeans(causal_contrasts[, group5_ind])
cate6 <- rowMeans(causal_contrasts[, group6_ind])
cate7 <- rowMeans(causal_contrasts[, group7_ind])
cate8 <- rowMeans(causal_contrasts[, group8_ind])
cate_read <- setNames(
  data.frame(cate1, cate2, cate3, cate4, cate5, cate6, cate7, cate8),
  c("female, middle, tech", "male, middle, tech",
    "female, working, tech", "male, working, tech",
    "female, middle, no_tech", "male, middle, no_tech",
    "female, working, no_tech", "male, working, no_tech")
)
# Wide left margin so the subgroup labels fit on the horizontal boxplot.
par(mar = c(4, 11, 4, 2))
boxplot(cate_read, horizontal = TRUE, las = 1,
        main = "Read: Conditional Average Treatment Effect")
##science
# Doubly-robust causal contrasts on the original (unlogged) science scale.
y <- data$science
y0_pred <- exp(science_outcome_control$yhat.test)
y1_pred <- exp(science_outcome_treat$yhat.test)
causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)
# Eight subgroups: gender x family_class x tech_access (gender varies fastest).
group1_ind <- (X$gender == 1) & (X$family_class == 1) & (X$tech_access == 1)
group2_ind <- (X$gender == 0) & (X$family_class == 1) & (X$tech_access == 1)
group3_ind <- (X$gender == 1) & (X$family_class == 0) & (X$tech_access == 1)
group4_ind <- (X$gender == 0) & (X$family_class == 0) & (X$tech_access == 1)
group5_ind <- (X$gender == 1) & (X$family_class == 1) & (X$tech_access == 0)
group6_ind <- (X$gender == 0) & (X$family_class == 1) & (X$tech_access == 0)
group7_ind <- (X$gender == 1) & (X$family_class == 0) & (X$tech_access == 0)
group8_ind <- (X$gender == 0) & (X$family_class == 0) & (X$tech_access == 0)
# Averaging contrasts within each subgroup, per draw, gives the CATE posterior.
cate1 <- rowMeans(causal_contrasts[, group1_ind])
cate2 <- rowMeans(causal_contrasts[, group2_ind])
cate3 <- rowMeans(causal_contrasts[, group3_ind])
cate4 <- rowMeans(causal_contrasts[, group4_ind])
cate5 <- rowMeans(causal_contrasts[, group5_ind])
cate6 <- rowMeans(causal_contrasts[, group6_ind])
cate7 <- rowMeans(causal_contrasts[, group7_ind])
cate8 <- rowMeans(causal_contrasts[, group8_ind])
cate_science <- data.frame(cate1, cate2, cate3, cate4, cate5, cate6, cate7, cate8)
colnames(cate_science) <- c("female, middle, tech",
                            "male, middle, tech",
                            "female, working, tech",
                            "male, working, tech",
                            "female, middle, no_tech",
                            "male, middle, no_tech",
                            "female, working, no_tech",
                            "male, working, no_tech")
par(mar = c(4, 11, 4, 2))
# Title fixed: this is the Science panel (original said "Read").
boxplot(cate_science, horizontal = TRUE, las = 1,
        main = "Science: Conditional Average Treatment Effect")
# Covariate-balance check: overlaid histograms of the continuous confounders
# for the treatment (private, drawn first) and control (public) groups.
wealth_priv <- rgb(0, 0, 1, 0.5)
wealth_pub <- rgb(1, 0, 0, 0.5)
hist(X_treat$wealth, col = wealth_priv, xlim = c(-7, 7), ylim = c(0, 1600),
     main = "wealth", xlab = "wealth", breaks = 30)
hist(X_control$wealth, col = wealth_pub, add = TRUE, breaks = 30)
legend("topright", legend = c("private", "public"),
       fill = c(wealth_priv, wealth_pub))
escs_priv <- rgb(0.2, 0.8, 0.2, 0.5)
escs_pub <- rgb(1, 0.6, 0, 0.5)
hist(X_treat$escs, col = escs_priv, xlim = c(-7, 7), ylim = c(0, 700),
     main = "escs", xlab = "escs", breaks = 30)
hist(X_control$escs, col = escs_pub, add = TRUE, breaks = 30)
legend("topright", legend = c("private", "public"),
       fill = c(escs_priv, escs_pub))
# Covariate-balance check for the binary confounders: grouped bar chart of
# within-group proportions, private vs public.
colors <- c(rgb(0, 0, 1, 0.5), rgb(0, 1, 1, 0.5))
variables <- c("gender", "family_class", "tech_access")
group <- c("private", "public")
# Row 1: proportions among private-school students; row 2: public.
Values <- matrix(c(colMeans(X_treat[, variables]),
                   colMeans(X_control[, variables])),
                 nrow = 2, ncol = 3, byrow = TRUE)
barplot(Values, main = "gender, family_class, tech_access", names.arg = variables,
        xlab = "variables", ylab = "percentage",
        col = colors, beside = TRUE)
legend("topleft", group, cex = 0.7, fill = colors)
# Pie chart of subgroup sizes, using the indicator vectors defined in the
# CATE section above (sum of a logical vector counts its TRUEs).
subgroup_inds <- list(group1_ind, group2_ind, group3_ind, group4_ind,
                      group5_ind, group6_ind, group7_ind, group8_ind)
num <- vapply(subgroup_inds, sum, integer(1))
label <- c("female, middle, tech",
           "male, middle, tech",
           "female, working, tech",
           "male, working, tech",
           "female, middle, no_tech",
           "male, middle, no_tech",
           "female, working, no_tech",
           "male, working, no_tech")
# Shrink the margins so the pie and its labels use the full device.
par(mar = c(0.5, 0.5, 0.5, 0.5))
pie(num, label, radius = 0.8)